dtable(ss)2 Expression data analysis
2.1 Sample sheet
The correspondence between the methylome and transcriptome names are:
- vector with FULL protein (without doxy): "Tat" (RNAseq [Tat_1, Tat_2, Tat_3]) = "TAT_NO" (microarrays [HC_SEN 3,9,14])
- vector with PARTIAL protein (without doxy):"Tat72" (RNAseq [Tat72_1, Tat72_2, Tat72_3]) = "Exon1_No" (microarrays [HC_SEN 5,11,16])
- vector with FULL protein (with doxy): "TatDOX" (RNAseq [TatDOX_1, TatDOX_2, TatDOX_3]) = "TAT_Dox" (microarrays [HC_SEN 4,10,15])
- vector with PARTIAL protein (with doxy): "Tat72DOX" (RNAseq [Tat72DOX_1, Tat72DOX_2, Tat72DOX_3]) = "Exon1_Dox" (microarrays [HC_SEN 6,12])
- vector without protein (empty): "TetOFF" (RNAseq [TetOFF_1, TetOFF_2, TetOFF_3]) = "Control_NO" (microarrays [HC_SEN 1,7,13])
- vector without protein (empty) + doxy: "TetOFFDOX" (RNAseq [TetOFFDOX_1, TetOFFDOX_2, TetOFF_3]) = "Control_Dox" (microarrays [HC_SEN 2,8])
2.2 Expression:
2.2.1 Counnts:
namedlist <- function(...) {
nl <- list(...)
argnames <- sys.call()
n<-sapply(argnames[-1], as.character)
names(nl)<-n
return(nl)
}
library(readxl)
library(data.table)
counts<-list()
Tat72_off <- readxl::read_xlsx("data-raw/Transcriptome_HClinic_Vcasanova_20042023/Tat72VSTetOFF.xlsx", sheet = 1)New names:
• `` -> `...1`
colnames(Tat72_off) <- c("gene",sapply(colnames(Tat72_off),function(x)unlist(strsplit(x,"\\."))[7])[-1])
setDT(Tat72_off)
Tat72_off[,ID :=tstrsplit(gene,"\\.",keep=1)]
Tat_Tat72 <- readxl::read_xlsx("data-raw/Transcriptome_HClinic_Vcasanova_20042023/TatVSTat72.xlsx", sheet = 1)New names:
• `` -> `...1`
colnames(Tat_Tat72) <- c("gene",sapply(colnames(Tat_Tat72),function(x)unlist(strsplit(x,"\\."))[7])[-1])
setDT(Tat_Tat72)
Tat_Tat72[,ID :=tstrsplit(gene,"\\.",keep=1)]
Tat_off <- readxl::read_xlsx("data-raw/Transcriptome_HClinic_Vcasanova_20042023/TatVSTetOFF.xlsx", sheet = 1)New names:
• `` -> `...1`
colnames(Tat_off) <- c("gene",sapply(colnames(Tat_off),function(x)unlist(strsplit(basename(x),"\\."))[1])[-1])
setDT(Tat_off)
Tat_off[,ID :=tstrsplit(gene,"\\.",keep=1)]
counts <-namedlist(Tat72_off,Tat_Tat72,Tat_off)dtable(counts[[1]])Warning in instance$preRenderHook(instance): It seems your data is too big for
client-side DataTables. You may consider server-side processing:
https://rstudio.github.io/DT/server.html
2.3 DEGs:
library(readxl)
library(data.table)
Tat72_off <- readxl::read_xlsx("data-raw/Transcriptome_HClinic_Vcasanova_20042023/Tat72VSTetOFF.xlsx", sheet = 3)
Tat72_off$Contrast="Tat72_off"
Tat_Tat72 <- readxl::read_xlsx("data-raw/Transcriptome_HClinic_Vcasanova_20042023/TatVSTat72.xlsx", sheet = 3)
Tat_Tat72$Contrast="Tat_Tat72"
Tat_off <- readxl::read_xlsx("data-raw/Transcriptome_HClinic_Vcasanova_20042023/TatVSTetOFF.xlsx", sheet = 3)
Tat_off$Contrast="Tat_off"dt_DEGs <- data.table::data.table(rbind(Tat_Tat72,Tat72_off,Tat_off))
# dt_DEGs <- dt_DEGs[RealFC<0.5 | RealFC >2,]dtable(dt_DEGs)Warning in instance$preRenderHook(instance): It seems your data is too big for
client-side DataTables. You may consider server-side processing:
https://rstudio.github.io/DT/server.html
2.4 Methylation:
“El primer objetivo es ver si la proteína TAT del VIH, induce cambios en el metiloma. La proteína TAT está codificada por 2 exones que realizan funciones diferentes y queremos además aislar el efecto de cada uno, por eso hemos generado unas líneas celulares de jurkat establemente transfectadas con:
- Un vector vacío que será nuestro control: muestras 1, 7 y 13 (triplicados)
- Un vector con la proteína TAT completa: muestras 3, 9 y 14 (triplicados)
- Un vector con solo el primer exón de TAT: muestras 5, 11 y 16 (triplicados)
La comparativa entre estos tres grupos es la primera que nos interesa. a-b, a-c y b-c.”
Por otro lado, la DOXYCICLINA “apaga” la expresión de TAT y el segundo objetivo entonces es ver si el “apagado” de TAT con la DOXY hace algún cambio en el metiloma. Por eso la comparativa aquí sería cada una de las tres líneas anteriores con y sin DOXY, así que tendremos además las siguientes muestras:
- Un vector vacío que será nuestro control + DOXY: muestras 2 y 8 (duplicados)
- Un vector con con la proteína TAT completa + DOXY: muestras 4, 10 y 15 (triplicados)
- Un vector con solo el primer exón de TAT + DOXY: muestras 6 y 12 (duplicados)
La segunda comparativa que nos interesa, por tanto, es. a-d, b-e y c-f.
While the first objective yielded positive results no difference was found for the second (Doxi) apporach.
2.4.1 PCA plot:
PCR plots:
2.4.2 DMRS:
dmrs<-readRDS("data/dmrs.rds")
ano_cols<-c("seqnames","start","end","width","no.cpgs","HMFDR","meandiff","Contrast")
dt_list<- lapply(
1:NROW(dmrs),function(rn){ # For every position or ProbeID do:
UCSC <- data.table( # 1. generate a data.table (per row)
dmrs[rn, # 2. split UCSC gene and position
lapply( # 3. new row for every combination of gene/position
overlapping.genes, # 4. add Contrast porbeID and bval difference
function(x)unlist(strsplit(x,","))
)
,by=ano_cols]
)
})
dt_dmrs <- rbindlist(dt_list)
DMRs <- dt_dmrs[,Gene:=V1][!is.na(V1),]
DMRs$V1 <- NULL
# c1<-DMRs[Contrast== "Control_No-Exon1_No",.(.SD,meandiff=-meandiff,Contrast="Tat72_off"),.SDcols=ano_cols[!ano_cols %in% c("meandiff","Contrast")]]
c1<-DMRs[Contrast== "Control_No-Exon1_No",]
c1$Contrast <- "Tat72_off"
c1$meandiff <- -c1$meandiff
c2<-DMRs[Contrast== "Control_No-TAT_No",]
c2$Contrast <- "Tat_off"
c2$meandiff <- -c2$meandiff
c3<-DMRs[Contrast== "Exon1_No-TAT_No",]
c3$Contrast <- "Tat_Tat72"
c3$meandiff <- -c3$meandiff
DMRs<-rbind(c1,c2,c3)
saveRDS(DMRs,"data/DMRs.rds")dtable(DMRs)2.5 DMRs vs expression
dt_DEG_DMR <- merge(DMRs,dt_DEGs,by=c("Gene","Contrast"),all = T)
tab <- dt_DEG_DMR[,.(DEGs=sum(!is.na(ID)),DMRs=sum(!is.na(start)),overlap=sum(!(is.na(start)) &!(is.na(ID)) )),by=c("Contrast")]
kableExtra::kbl(tab)|>kableExtra::kable_classic_2()| Contrast | DEGs | DMRs | overlap |
|---|---|---|---|
| Tat_Tat72 | 3087 | 1092 | 224 |
| Tat_off | 3102 | 990 | 207 |
| Tat72_off | 1594 | 53 | 8 |
2.5.1 Venn diagrams:
2.5.2 DEG/DMR overlapping Correlations:
Now we want to get the mean beta value for each sample in the dmr location: 1. Find all cpg that fall in the dmr 2. get the mean for each sample 3. Match each sample bVal with it’s expression value. (No way to do it, since there is no key sample to sample) 4. Make correlation 5. Plot.
ol <- dt_DEG_DMR[!(is.na(start)) &!(is.na(ID)),]library(data.table)
library(GenomicRanges)
library(IlluminaHumanMethylationEPICanno.ilm10b2.hg19)
locs <- IlluminaHumanMethylationEPICanno.ilm10b2.hg19::Locations
locs.ranges <- GRanges(locs$chr, IRanges(locs$pos, locs$pos))
names(locs.ranges) <- rownames(locs)
locs.ranges$gene_element <- IlluminaHumanMethylationEPICanno.ilm10b2.hg19::Other$UCSC_RefGene_Group
locs.ranges$gene <- IlluminaHumanMethylationEPICanno.ilm10b2.hg19::Other$UCSC_RefGene_Name
dt_DEG_DMR_intersect<-lapply(1:NROW(ol),function(nr){
cont<-ol[nr,Contrast]
xpr <- counts[[cont]][ID==ol[nr,ID]]
xpr<-xpr[,.SD,.SDcols=names(xpr)[!(names(xpr)%in%c("ID","gene"))]]
expression <- suppressWarnings(melt(xpr,measure.vars=colnames(xpr),variable.name = "SID",value.name = "TPM" ))
SIDs <- as.character(expression$SID)
cgs <- names(subsetByOverlaps(locs.ranges,GRanges(ol[nr,])))
#### Annotate DMRs with genomic location:
ann<-as.data.table(mcols( subsetByOverlaps(locs.ranges,GRanges(ol[nr,]))))
# Possibility of multiple genes and genomic location per probe
genomic<-lapply(
1:NROW(ann),function(cg){ # For every position or ProbeID do:
UCSC <- data.table( # 1. generate a data.table (per row)
ann[cg, # 2. split UCSC gene and position
lapply( # 3. new row for every combination of gene/position
.SD, # 4. add Contrast porbeID and bval difference
function(x)unlist(strsplit(x,";"))
)
,.SDcols=c("gene_element","gene")]
)
})
# 5. rbind all together:
genomic_loc <- rbindlist(genomic)|> unique() # should get gene annotation from here to be fair
ge <- factor(genomic_loc$gene_element,levels = c("3'UTR","ExonBnd", "1stExon","5'UTR","Body","TSS200","TSS1500"),ordered=T )
gene_location <- ifelse(startsWith( as.character(max(ge)),"TSS" ),"promoter","intragenic")
gene_element <- paste(genomic_loc$gene_element,collapse = ";")
####
b<-betas[probeID %in% cgs ,lapply(.SD,function(x)mean(x,na.rm=T)),.SDcols=SIDs]
meth <- suppressWarnings( melt(b,measure.vars=colnames(b),variable.name = "SID",value.name = "DMR.meth" ))
# list(meth,expression)
m <- merge(meth,expression,by="SID")
m <- m[,lapply(.SD,as.numeric),by=SID]
dt<-cbind(ol[nr,],m)
cor<-cor.test(dt$DMR.meth,dt$TPM)
dt$correlation <- cor$estimate
dt$cor.pval <- cor$p.value
dt$gene_element <- gene_element
dt$gene_location <- gene_location
return(dt)
})|>rbindlist()
dt_DEG_DMR_intersect[,grp:=tstrsplit(SID,"_",keep = 1)]dt_cor <-dt_DEG_DMR_intersect
setorder(dt_cor,cor.pval,Gene)
saveRDS(dt_cor,"data/dt_cor.rds")2.5.2.1 Table for DMR & DEG:
In the following table the values for methylation and expression are integrated:
dtable(dt_cor[,.SD,.SDcols=c("Gene","Contrast","SID","DMR.meth","TPM","correlation","cor.pval")])2.5.3 Plots
library(ggplot2)
library(data.table)
library(ggpubr)
library(ggrepel)
DMR_DEG_plots <- list()
plots_folder <- "analysis/Integration_with_expression/DEGvsDMR/pval"
dir.create(plots_folder,recursive = T,showWarnings = F)
setkeyv(dt_cor,c("cor.pval","Gene","width"))
for(cont in unique(dt_cor$Contrast)){
dt<-head(dt_cor[Contrast==cont,],60)
lg <- dt[,.(gene=unique(Gene),width=unique(width)),by=cor.pval]
plist <- lapply(1:NROW(lg),function(nr){
pdata <- dt_cor[lg[nr,]]
plot_name <- paste0("/DEG_DMR.pval_", nr, ".png")
plt_path <- paste(plots_folder, cont, sep = .Platform$file.sep)
dir.create(plt_path,recursive = T,showWarnings = F)
p<-ggplot(data=pdata, aes(x=as.numeric(TPM),y=as.numeric(DMR.meth),colour=SID)) +
geom_point(aes(shape=grp),size=8,alpha=0.6) +
xlab("Mean DMR methylation") + ylab("Transcripts x Milion") +
geom_smooth(method = "lm",
inherit.aes = F,
aes(x=as.numeric(pdata$TPM),y=as.numeric(pdata$DMR.meth)),
linetype="dashed",
se=F,
formula = "y ~ x")+
annotate(geom = "text",
x = -Inf, y = Inf,
label=paste0("correlation: ", round(unique(pdata$correlation),4), ", p-value: ", round(unique(pdata$cor.pval),8)),
hjust = 0, vjust = 1
) +
ggtitle(paste0(" Correlation between expression and methylation for gene: ",unique(pdata$Gene)))
ggsave(path = plt_path, filename = plot_name, plot=p)
return(p)
})
DMR_DEG_plots[[cont]] <- plist
}
saveRDS(DMR_DEG_plots,"data/DMR_DEG_plots.rds")2.5.3.1 Tat_Tat72
DMR_DEG_plots[["Tat_Tat72"]]2.5.3.2 Tat_off
DMR_DEG_plots[["Tat_off"]]2.5.3.3 Tat72_off
DMR_DEG_plots[["Tat72_off"]]The pathways analysis will be performed analysing 2 sets of genes: 1. Full DMR/DMPs: the full set of significant diferentially methylated & expressed genes. 2. Cor.sig: A subset of the above where the correlation p.value is smaller than 0.05
2.6 DNA methylation context:
Although the relationship between DNA methylation and gene expression is complex and the mechanisms involved in gene regulation by DNA methylation are diverse. Fortunately, over the years researchers have found some common relationships between gene methylation and gene expression:
DNA methylation is an epigenetic mark that has been traditionally associated with gene silencing, specially when methylation happens on promoter regions. DNA methylation is related with the repressed chromatin state which blocks the access of transcription factors to promoter regions. Thus, silencing promoter activity in a stable way and reducing transcription Suzuki and Bird (2008).
In the other hand, the bodies of active genes in plants and animals are often heavily methylated. Suzuki and Bird (2008)
Therefore, usually it is expected to find most of the genes to have negative correlations between DNA methylation and gene transcription in promoter regions while some of the genes might present positive correlations specially in intra-genic regions.
Now, let’s inspect our data, the following table shows a summary of the paired DMGs and DEGs:
tab.full <- dt_cor[,.(negative.cor=sum(correlation<0)/6,positive.cor=sum(correlation>0)/6,Total=sum(correlation!=0)/6),by=c("Contrast")]
kableExtra::kbl(tab.full)|>kableExtra::kable_classic_2()| Contrast | negative.cor | positive.cor | Total |
|---|---|---|---|
| Tat_off | 84 | 123 | 207 |
| Tat_Tat72 | 106 | 118 | 224 |
| Tat72_off | 3 | 5 | 8 |
::: callout-warning {.column-page-inset} Be aware that DMGs are based on DMRs. More than one DMR can be mapped to the same gene. Therefore, there can be more than one pair of DMG/DEG for the same gene as it happens for LTBP1 gene in Tat72_off contrast.
| Contrast | seqnames | start | end | no.cpgs | width | gene_element | gene_location | Gene |
|---|---|---|---|---|---|---|---|---|
| Tat72_off | chr13 | 67551364 | 67551521 | 3 | 158 | TSS200;Body;Body | promoter | PCDH9 |
| Tat72_off | chr3 | 15311021 | 15311269 | 4 | 249 | Body;5’UTR | intragenic | SH3BP5 |
| Tat72_off | chr5 | 172070871 | 172072282 | 3 | 1412 | Body | intragenic | NEURL1B |
| Tat72_off | chr3 | 116163694 | 116164943 | 13 | 1250 | 5’UTR;1stExon;TSS200;Body;TSS1500 | promoter | LSAMP |
| Tat72_off | chr4 | 157713603 | 157714561 | 3 | 959 | Body | intragenic | PDGFC |
| Tat72_off | chrY | 21877683 | 21878594 | 3 | 912 | Body;ExonBnd | intragenic | KDM5D |
| Tat72_off | chr2 | 33495871 | 33495908 | 3 | 38 | Body | intragenic | LTBP1 |
| Tat72_off | chr2 | 33216742 | 33217280 | 3 | 539 | Body | intragenic | LTBP1 |
In the case of our data there are more positive than negative correlations. You can inspect the data in the following table:
dtable(dt_cor[,.SD,.SDcols=c("Gene","Contrast","width","SID","DMR.meth","TPM","correlation","cor.pval","gene_element", "gene_location")])2.7 Full
This dataset is not filtered by the strength of the correlation between DMRs and DEGs.
2.7.1 Pathways
path_results<-function(pathway,topN=50,method="method",pval=0.05,path="results/pathways.csv",cols=NULL){
# pathway<-pathway[FDR<1,]
pathway$method<-pathway[[method]]
data.table::setorder(pathway,method,FDR)
sig_idx <- pathway[,.I[FDR < pval] ,by=method]$V1
head_idx<-pathway[,.I[1:min(..topN,.N)],by=c(method,"Contrast")]$V1
res<-pathway[base::union(sig_idx,head_idx),]
# res[,TERM:=ifelse(FDR<pval,paste0("*** ",TERM," ***"),TERM)]
# data.table::fwrite(res,path)
results<-res[,.SD,.SDcols=c("Contrast","FDR",cols,"TERM","method")]
data.table::setorder(results,Contrast,method,FDR)
return(results)
}
library(gprofiler2)
pathways_full <- lapply(unique(dt_cor$Contrast),function(cont){
library(gprofiler2)
library(data.table)
p<-gprofiler2::gost(signif = T ,unique(dt_cor[Contrast==cont,Gene]))[[1]]
if(!is.null(p)){
dth<- data.table::as.data.table(p)
dth[,FDR:=p_value]
dth[,TERM:=term_name]
dth[,source:=factor(source)]
dth[,Contrast:=cont]
}
})No results to show
Please make sure that the organism is correct or set significant = FALSE
Pathway_Full<-path_results(rbindlist(pathways_full),method="source",cols = c("term_size","query_size","intersection_size"))
saveRDS(Pathway_Full,"data/pathway_full.rds")dtable(Pathway_Full)As you can see the terms are rather general. Let’s try to filter based on the direction of correlation + / -:
2.7.1.1 Positive:
dt_pos <- dt_cor[correlation>0,]
pathways_full_positive <- lapply(unique(dt_pos$Contrast),function(cont){
library(gprofiler2)
library(data.table)
p<-gprofiler2::gost(signif = T ,unique(dt_pos[Contrast==cont,Gene]))[[1]]
if(!is.null(p)){
dth<- data.table::as.data.table(p)
dth[,FDR:=p_value]
dth[,TERM:=term_name]
dth[,source:=factor(source)]
dth[,Contrast:=cont]
}
})No results to show
Please make sure that the organism is correct or set significant = FALSE
Pathway_Full_Pos<-path_results(rbindlist(pathways_full_positive),method="source",cols = c("term_size","query_size","intersection_size"))
saveRDS(Pathway_Full_Pos,"data/pathway_full_positive.rds")dtable(Pathway_Full_Pos)2.7.1.2 Negative:
dt_neg <- dt_cor[correlation>0,]
pathways_full_negitive <- lapply(unique(dt_neg$Contrast),function(cont){
library(gprofiler2)
library(data.table)
p<-gprofiler2::gost(signif = T ,unique(dt_neg[Contrast==cont,Gene]))[[1]]
if(!is.null(p)){
dth<- data.table::as.data.table(p)
dth[,FDR:=p_value]
dth[,TERM:=term_name]
dth[,source:=factor(source)]
dth[,Contrast:=cont]
}
})No results to show
Please make sure that the organism is correct or set significant = FALSE
Pathway_Full_neg<-path_results(rbindlist(pathways_full_negitive),method="source",cols = c("term_size","query_size","intersection_size"))
saveRDS(Pathway_Full_neg,"data/pathway_full_negative.rds")dtable(Pathway_Full_neg)2.8 cor.sig
This dataset is not filtered by the strength of the correlation between DMRs and DEGs.
2.8.1 Pathways
dt.sig <- dt_cor[cor.pval<0.05 ,.(correlation=unique(correlation)),by=c("Contrast","Gene","width")]
library(gprofiler2)
pathways_full <- lapply(unique(dt.sig$Contrast),function(cont){
library(gprofiler2)
library(data.table)
p<-gprofiler2::gost(signif = T ,unique(dt.sig[Contrast==cont,Gene]))[[1]]
if(!is.null(p)){
dth<- data.table::as.data.table(p)
dth[,FDR:=p_value]
dth[,TERM:=term_name]
dth[,source:=factor(source)]
dth[,Contrast:=cont]
}
})No results to show
Please make sure that the organism is correct or set significant = FALSE
Pathway_Full<-path_results(rbindlist(pathways_full),method="source",cols = c("term_size","query_size","intersection_size"))
saveRDS(Pathway_Full,"data/pathway_full_sig.rds")dtable(Pathway_Full)As you can see the terms are rather general. Let’s try to filter based on the direction of correlation + / -:
2.8.1.1 Positive:
dt_pos <- dt.sig[correlation>0,]
pathways_full_positive <- lapply(unique(dt_pos$Contrast),function(cont){
library(gprofiler2)
library(data.table)
p<-gprofiler2::gost(signif = T ,unique(dt_pos[Contrast==cont,Gene]))[[1]]
if(!is.null(p)){
dth<- data.table::as.data.table(p)
dth[,FDR:=p_value]
dth[,TERM:=term_name]
dth[,source:=factor(source)]
dth[,Contrast:=cont]
}
})No results to show
Please make sure that the organism is correct or set significant = FALSE
Pathway_Full_Pos<-path_results(rbindlist(pathways_full_positive),method="source",cols = c("term_size","query_size","intersection_size"))
saveRDS(Pathway_Full_Pos,"data/pathway_full_positive_sig.rds")dtable(Pathway_Full_Pos)2.8.1.2 Negative:
dt_neg <- dt.sig[correlation>0,]
pathways_full_negitive <- lapply(unique(dt_neg$Contrast),function(cont){
library(gprofiler2)
library(data.table)
p<-gprofiler2::gost(signif = T ,unique(dt_neg[Contrast==cont,Gene]))[[1]]
if(!is.null(p)){
dth<- data.table::as.data.table(p)
dth[,FDR:=p_value]
dth[,TERM:=term_name]
dth[,source:=factor(source)]
dth[,Contrast:=cont]
}
})No results to show
Please make sure that the organism is correct or set significant = FALSE
Pathway_Full_neg<-path_results(rbindlist(pathways_full_negitive),method="source",cols = c("term_size","query_size","intersection_size"))
saveRDS(Pathway_Full_neg,"data/pathway_full_negative_sig.rds")dtable(Pathway_Full_neg):::